

# to manipulate data
library(tidyverse)
library(lubridate)
library(gdata)


# load workspaces with topic proportions, dictionary results
load("workspace_topic_models_unwaived.Rdata")
load("workspace_dictionary_methods_unwaived.Rdata")

# to use up less space, just keep the dataframe with topic proportions
keep(unwaived,results_unwaived,results_unwaived_dictionary, sure = T)


# load covariates (to merge with topic proportions)
load("covariates.RData")



####### TOPIC PROPORTIONS
# calculate average topic proportions for each agreement
results_unwaived2 <- results_unwaived %>%
  group_by(iso3c, id) %>%
  mutate(across(topic1_natres, list(full = ~ mean(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,topic1_natres_full) %>%
  unique()

# now stretch them to every country-year under agreement
results_unwaived3 <- results_unwaived2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_unwaived2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_unwaived3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  400 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_unwaived4 <- results_unwaived3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_unwaived4 <- rbind(results_unwaived4,test) %>%
  select(iso3c,year,topic1_natres_full)

results_unwaived4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!




####### DICTIONARY ANALYSIS - ABSOLUTE
# calculate total number of absolute mentions for each agreement
results_unwaived_dictionary2 <- results_unwaived_dictionary %>%
  group_by(id) %>%
  mutate(across(resources_abs, list(total = ~ sum(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,resources_abs_total) %>%
  unique()


# now stretch them to every country-year under agreement
results_unwaived_dictionary3 <- results_unwaived_dictionary2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_unwaived_dictionary2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_unwaived_dictionary3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  400 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_unwaived_dictionary4 <- results_unwaived_dictionary3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_unwaived_dictionary4 <- rbind(results_unwaived_dictionary4,test) %>%
  select(iso3c,year,resources_abs_total)

results_unwaived_dictionary4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!



####### DICTIONARY ANALYSIS - RELATIVE (TF-IDF)
# calculate average tf-idf for each agreement
results_unwaived_dictionary_tfidf2 <- results_unwaived_dictionary %>%
  group_by(id) %>%
  mutate(across(resources_tfidf, list(total = ~ mean(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,resources_tfidf_total) %>%
  unique()


# now stretch them to every country-year under agreement
results_unwaived_dictionary_tfidf3 <- results_unwaived_dictionary_tfidf2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_unwaived_dictionary_tfidf2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_unwaived_dictionary_tfidf3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  400 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_unwaived_dictionary_tfidf4 <- results_unwaived_dictionary_tfidf3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_unwaived_dictionary_tfidf4 <- rbind(results_unwaived_dictionary_tfidf4,test) %>%
  select(iso3c,year,resources_tfidf_total)

results_unwaived_dictionary_tfidf4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!



####### MERGE
full_df <- results_unwaived4 %>%
  full_join(covariates) %>%
  full_join(results_unwaived_dictionary4) %>%
  full_join(results_unwaived_dictionary_tfidf4) %>%
  group_by(iso3c,country) %>%
  arrange(year, .by_group = T) %>%
  ungroup() %>%
  filter(year<2020) %>%
  select(country,iso3c,region,income,year,stabilization_law:pension_law,imf_program,topic1_natres_full,
         resources_abs_total,resources_tfidf_total,
         imf_program_new:nationalization_oc) %>%
  mutate_at(vars(imf_program:imf_program_last_year), ~replace_na(., 0)) %>%
  rename(resource_conditionality = topic1_natres_full,
         resource_conditionality_dict_abs = resources_abs_total,
         resource_conditionality_dict_tfidf = resources_tfidf_total)

# generate a time trend
time_trend <- data.frame(year = 1979:2019, t = 1:41)

# generate short-term vs long-term variables
full_df <- full_df %>%
  mutate(short_term = ifelse(stabilization_law==1 | investment_law==1 | development_law==1, 1, 0),
         long_term = ifelse(savings_law==1 | pension_law==1, 1, 0),
         any_fund = ifelse(short_term==1 | long_term==1, 1, 0),
         iso3c = as.factor(iso3c)) %>%
  group_by(iso3c) %>%
  arrange(year, .by_group = T) %>%
  mutate(across(short_term:any_fund, list(lead = ~ lead(.,1))),
         across(resource_conditionality, list(lead = ~ lead(.,1))),
         cumulative_short_term = cumsum(lag(short_term, n = 1, default = 0)),
         previous_short_term  = ifelse(cumulative_short_term>0, 1, 0),
         cumulative_long_term = cumsum(lag(long_term, n = 1, default = 0)),
         previous_long_term  = ifelse(cumulative_long_term>0, 1, 0)) %>%
  ungroup() %>%
  mutate(log_gdp_pc_constant = log(gdp_pc_constant),
         iso3c = as.factor(iso3c),
         any_war = ifelse(independence_war_mepv==1 | international_war_mepv==1 | civil_war_mepv==1 | ethnic_var_mepv==1, 1, 0),
         any_war = ifelse(is.na(any_war), 0, any_war),
         wb_extractive_project = ifelse(is.na(wb_extractive_project), 0, 1),
         left_executive = as.numeric(left_executive),
         election = as.numeric(election),
         nationalization_oc = as.numeric(nationalization_oc),
         log_external_debt_stocks = log(external_debt_stocks),
         field_discovery = ifelse(is.na(field_discovery), 0, field_discovery)) %>%
  left_join(time_trend) %>% # add time trend
  rename(ethnic_war_mepv = ethnic_var_mepv)


save(full_df, file = "full_data_conditionality_covariates_unwaived.RData")  
